[Day12] 文本/詞表示方式(三)-TFIDF實作

2021 iThome 鐵人賽

DAY 12

自我挑戰組

30天初步了解自然語言處理-自學筆記系列第 12 篇

13th鐵人賽 tfidf

eating

2021-09-12 15:43:47

1530 瀏覽

分享至

一. 序

這篇主要用 Python 實作 TF-IDF，作為表示文本的一種方式

二. 載入套件與文本

主要利用jieba斷詞

import jieba
import math

# Point jieba at the Traditional Chinese dictionary file before segmenting
jieba.set_dictionary('dict.txt.big')

# Article summary from iThome; source: https://www.ithome.com.tw/news/146142
text_a = '從GPT-3衍生改良而來的Codex模型，能夠將使用者的自然語言指令轉換為程式碼，OpenAI現在以私人測試的方式釋出CodexAPI'
# Article summary from iThome; source: https://www.ithome.com.tw/news/145743
text_b = 'Blender2.0除了能即時搜尋網路資訊，臉書也為其打造新的神經模組，可根據之前使用者與它的聊天脈絡來累積記憶'

斷詞與計算每個詞出現的次數

# Tokenize both documents (text_a first, then text_b).
texta_seg, textb_seg = jieba.lcut(text_a), jieba.lcut(text_b)
# Shared vocabulary: every distinct token seen in either document.
unique_words = set(texta_seg) | set(textb_seg)

# One count table per document, keyed by the whole vocabulary and
# starting at zero so both tables share exactly the same keys.
num_words_a = {token: 0 for token in unique_words}
num_words_b = {token: 0 for token in unique_words}

# Tally how often each token occurs in document A ...
for word in texta_seg:
    num_words_a[word] += 1

# ... and in document B.
for word in textb_seg:
    num_words_b[word] += 1

num_words_a 記錄所有文件的每個詞在 text_a 中出現的次數（沒有出現在 text_a 的詞，次數為 0）

三. 實作TF與IDF的function

def get_TF_value(w_dict, text_seg_len):
    """Return the term frequency (TF) of every word in ``w_dict``.

    Args:
        w_dict: Mapping of word -> occurrence count.
        text_seg_len: Value each count is divided by; it is converted
            with ``float()`` before the division.

    Returns:
        A new dict mapping each word to ``count / float(text_seg_len)``.
    """
    # TF formula: occurrences of the word divided by the given length.
    return {
        term: occurrences / float(text_seg_len)
        for term, occurrences in w_dict.items()
    }

def get_IDF_value(text_list, all_words):
    """Return the inverse document frequency (IDF) of each vocabulary word.

    Args:
        text_list: List of per-document dicts mapping word -> count.
            A word counts as present in a document when its count is > 0.
        all_words: The vocabulary. Any iterable of words works; when a
            dict is passed, only its keys are used.

    Returns:
        A dict mapping each vocabulary word to
        ``math.log(len(text_list) / document_frequency)``. A word that
        occurs in no document gets ``0.0`` instead of raising
        ``ZeroDivisionError``.

    Raises:
        KeyError: If a document contains a word with a positive count
            that is not part of ``all_words``.
    """
    # Document frequency: in how many documents does each word appear?
    doc_freq = dict.fromkeys(all_words, 0)

    for text in text_list:
        for w, val in text.items():
            # A positive count means the word occurs in this document.
            if val > 0:
                doc_freq[w] += 1

    n_docs = len(text_list)
    # IDF formula: log(N / df). Words with df == 0 cannot be divided; they
    # are given 0.0 so they contribute nothing to a TF-IDF product.
    return {
        w: math.log(n_docs / float(df)) if df > 0 else 0.0
        for w, df in doc_freq.items()
    }

三. 計算tfidf

# Term frequencies of each document, normalised by its token count.
tf_a = get_TF_value(num_words_a, len(texta_seg))
tf_b = get_TF_value(num_words_b, len(textb_seg))

# IDF over the two-document corpus; num_words_a supplies the vocabulary keys.
idf = get_IDF_value([num_words_a, num_words_b], num_words_a)

# TF-IDF of a word = its TF in the document times its corpus-wide IDF.
tfidf_a = {term: tf * idf[term] for term, tf in tf_a.items()}
tfidf_b = {term: tf * idf[term] for term, tf in tf_b.items()}

tfidf_a的output如下:

{'能': 0.0,
 '來': 0.0,
 '即時': 0.0,
 '而來': 0.023104906018664842,
 '可': 0.0,
 '，': 0.0,
 '之前': 0.0,
 '模組': 0.0,
 '指令': 0.023104906018664842,
 '的': 0.0,
 '測試': 0.023104906018664842,
 '也': 0.0,
 '使用者': 0.0,
 '3': 0.023104906018664842,
 ...}

四. 用TFIDF表示成句子/文本

# 創建一個表示text a的list
# Vector representation of text a: its TF-IDF weights, in the
# iteration order of tfidf_a.
bow_a = list(tfidf_a.values())

print(bow_a)

bow_a最後的表示如下:

[0.0, 0.0, 0.0, 0.023104906018664842, 0.0, 0.0, 0.0, 0.0, 0.023104906018664842, 0.0, 0.023104906018664842, 0.0, 0.0, 0.023104906018664842, 0.0, 0.023104906018664842, 0.023104906018664842, 0.0, 0.023104906018664842, 0.0, 0.0, 0.023104906018664842, 0.023104906018664842, 0.0, 0.0, 0.0, 0.0, 0.023104906018664842, 0.0, 0.0, 0.023104906018664842, 0.023104906018664842, 0.023104906018664842, 0.023104906018664842, 0.0, 0.0, 0.023104906018664842, 0.023104906018664842, 0.023104906018664842, 0.0, 0.023104906018664842, 0.023104906018664842, 0.023104906018664842, 0.0, 0.023104906018664842, 0.023104906018664842, 0.0, 0.0, 0.0, 0.0, 0.023104906018664842]